Hierarchical Bayesian Neural Network for Efficient High-dimensional Asset Return Prediction.
Improving efficiency by sharing information between asset specific neural networks via informative priors.
- Data Generation
- Hierarchical Model
- Industry one-hot encoding as feature
- Eigenvectors of the covariance matrix as encodings
- Conclusion
Financial assets exhibit hierarchical structure. They belong to classes such as stocks or bonds, are country specific, belong to sectors and industries, and load on other common risk factors.
Stocks may have different conditional expectations given their sector, for example. To build performant models, these differences need to be modeled.
In the following, a panel data set of asset returns is simulated. By construction, the unconditional expectation (i.e., the expectation ignoring the dependence on sectors) is equal to zero. Hence, a pooled model is unable to fit the data.
There are different ways to fit the model:
- fit pooled data (here: impossible to achieve a good fit.)
- fit data independently per sector (inefficient)
- encode sector as feature and fit pooled data
- use hierarchical model
A Bayesian approach allows each asset specific neural network to deviate from the group model if and only if there is sufficient evidence to conclude that it follows a different data generating process.
In a Bayesian framework we could also model the multiple levels of the hierarchical structure explicitly. The overall net may serve as a prior for sector nets, which, in turn, may serve as priors for asset specific nets.
def mse(y_hat, y):
    """Mean squared error between predictions ``y_hat`` and targets ``y``."""
    err = y_hat - y
    return (err ** 2).mean()
def get_ir(log_rets, Y_hat, inv_cov_tensor):
    """Annualized information ratio of a prediction-weighted portfolio.

    Daily weights are proportional to ``inv_cov @ prediction`` and scaled
    to unit gross leverage; the IR annualizes with sqrt(252) trading days.

    Returns the IR and the daily portfolio log-returns.
    """
    # Per-day raw weights: w_t = Sigma_t^{-1} y_hat_t.
    raw_w = torch.einsum('np, npj -> nj', Y_hat, inv_cov_tensor).detach().numpy()
    # Scale each day's weights so absolute exposures sum to one.
    weights = raw_w / np.abs(raw_w).sum(axis=1)[:, None]
    daily_ret = np.einsum('ij, ij->i', weights, log_rets)
    annualization = np.sqrt(252)
    ir = daily_ret.mean() / daily_ret.std() * annualization
    return ir, daily_ret
def add_daily_trend_to_log_price(log_price, feature_list, beta_list):
    """
    This function adds a daily trend to high-frequency simulations.

    For each (beta, feature) pair, the daily feature value scaled by beta
    is spread evenly over the intraday periods of that day and accumulated
    into ``log_price``.

    Parameters
    ----------
    log_price : pd.DataFrame
        Intraday log prices (datetime index, one column per asset).
        Mutated in place via the in-place ``+=`` at the end.
    feature_list : list of pd.DataFrame
        Daily signals indexed by date, same columns as ``log_price``.
    beta_list : list of float
        Scale applied to the matching entry of ``feature_list``.
    """
    for beta, feature in zip(beta_list, feature_list):
        # Intraday-frequency frame aligned with log_price, filled below.
        signal = pd.DataFrame(log_price).copy()
        signal.iloc[:, :] = np.nan
        for i in set(signal.index.date):
            for c in signal.columns:
                # Spread the day's signal evenly across intraday periods.
                # NOTE(review): relies on the module-level global
                # `n_periods` being defined before this function is called.
                signal.loc[signal.index.date == i, c] =\
                    feature[feature.index == i][c].values * beta / n_periods
        # Cumulative sum turns the per-period drift into a level trend.
        signal_cumsum = signal.cumsum()
        # In-place addition: the caller's DataFrame is modified.
        log_price += signal_cumsum
# Hyperparams
n_stocks = 50
n_days = 100
# number of industries/sectors
n_ind = 25
# fraction of days used for training
pct_train = 0.1
# market factor loadings, evenly spread across stocks
factor_loadings = np.linspace(1, 3, n_stocks)[:, None]
# number of stocks per industry
npi = int(n_stocks/n_ind)
# block one-hot industry membership matrix (n_stocks x n_ind)
industry_loadings = np.zeros((n_stocks, n_ind))
for i in range(n_ind):
    industry_loadings[i*npi:(i+1)*npi, i] = np.ones(npi)[:]
# beta
# each industry gets its own signal beta, evenly spaced in [-1, 1]
feature_beta = industry_loadings @ np.linspace(-1,1, n_ind)
# np.random.shuffle(feature_beta)
# Use only intraday returns for cov estimation
d = 1
# High-frequency prices are irregularly sampled
# proportion of sampled prices
liq = 1#0.8
# Prices are contaminated with micro structure noise
# 10ct per $100 ms noise
gamma = 0#0.1/100
# mins per day
n_periods = 6.5*60
# GARCH(1,1) spec
sigma_sq_0 = 0.1**2/250/n_periods
garch_alpha = 0#0.019
garch_beta = 0#0.98
# unconditional-variance-targeting intercept of the GARCH recursion
omega = sigma_sq_0*(1-garch_alpha-garch_beta)
# preaveraging parameters
k = int(0.5*(n_periods)**(0.5))
pairwise = True
# NERIVE parameters
L = 4
stp = hd._get_partitions(L)
# condition number targeting parameters
max_cond_target = 800
max_iter = 100
step = 0.05
# Integrated covariance estimators to run; alternatives are kept
# commented out for reference.
estimators = {
    # 'realized_cov': partial(hf.mrc, pairwise=False, theta=0),
    # 'msrc': partial(hf.msrc, pairwise=pairwise, M=M, N=N),
    'mrc': partial(hf.mrc, pairwise=pairwise, theta=None, k=k),
    # 'hy': partial(hf.hayashi_yoshida, theta=None, k=k),
    # 'krvm': partial(hf.krvm, H=H, pairwise=pairwise,
    #                 kernel=hf.parzen_kernel),
    # 'epic': partial(hf.ensemble, var_weights=np.ones(4)/4,
    #                 cov_weights=np.ones(4)/4),
}
# Simulate a universe of high-frequency prices with factor and industry
# structure (GARCH params per component, liquidity, microstructure noise).
u = sim.Universe(0,
                 [sigma_sq_0, 0, garch_alpha, garch_beta, omega],
                 [sigma_sq_0, 0, garch_alpha, garch_beta, omega],
                 [sigma_sq_0, 0, garch_alpha, garch_beta, omega],
                 factor_loadings,
                 industry_loadings,
                 liq,
                 gamma,
                 'm',
                 )
u.simulate(n_days)
price = u.price
# Two i.i.d. daily features per stock.
feature_daily = np.random.normal(0, 0.1, n_days*n_stocks).reshape(n_days, n_stocks)
feature2_daily = np.random.normal(0, 0.1, n_days*n_stocks).reshape(n_days, n_stocks)
# add a daily predictable mean to high-frequency data
# Interaction signal: predictable mean = industry beta * x1 * x2.
signal = feature_beta * feature_daily * feature2_daily
signal_df = pd.DataFrame(signal,
                         index=pd.Series(list(set(price.index.date))).sort_values(),
                         columns=price.columns)
log_price = np.log(price)
add_daily_trend_to_log_price(log_price, [signal_df], [1.])
price = np.exp(log_price)
# Open-to-close daily log-returns.
price_daily_close = price.resample('1d').last()
price_daily_open = price.resample('1d').first()
log_rets_daily = np.log(price_daily_close) - np.log(price_daily_open)
# X: (n_days, 2 features, n_stocks); Y: (n_days, n_stocks).
X = torch.tensor([feature_daily, feature2_daily]).permute(1, 0, 2).float()
Y = torch.tensor(log_rets_daily.values).float()
n_time_steps_train = int(n_days*pct_train)
n_time_steps_valid = n_days - n_time_steps_train
covs_nerive = {key: [] for key in estimators.keys()}
# Compute the integrated covariance matrices.
for i in log_rets_daily.index.date[:n_time_steps_train]:
    # Intraday prices within +/- d days of day i.
    p = price[(price.index.date < i+timedelta(days=d))&(price.index.date > i-timedelta(days=d))]
    # Per-asset cumulative demeaned residual series fed to the estimators.
    l = [hf.get_cumu_demeaned_resid(p.iloc[:, j]) for j in range(price.shape[1])]
    for name, estimator in estimators.items():
        # NOTE(review): `cov` is computed but never used below — only the
        # NERIVE estimate is stored; confirm this is intentional.
        cov = estimator(l)
        # covs[name].append(cov)
        if max_cond_target is None:
            covs_nerive[name].append(hd.nerive(l, estimator=estimator, stp=stp))
        else:
            # Shrink the estimate towards the target condition number.
            covs_nerive[name].append(hd.linear_shrink_target(
                hd.nerive(l, estimator=estimator, stp=stp),
                max_cond_target, step, max_iter))
# split into train and valid set
Y_train = Y[:n_time_steps_train, :]
X_train = X[:n_time_steps_train, :, :]
Y_valid = Y[n_time_steps_train:, :]
X_valid = X[n_time_steps_train:, :, :]
# normalize
# Per-feature mean/std across time and stocks; the validation set is
# standardized with the training statistics (no look-ahead).
train_mean = X_train.mean(keepdim=True, dim=(0, 2))
train_std = X_train.std(keepdim=True, dim=(0, 2))
X_train = (X_train - train_mean) / train_std
X_valid = (X_valid - train_mean) / train_std
# Inverse conditional covariances for the validation days.
# NOTE(review): uses the simulator's ground-truth covariances, not the
# estimated ones — confirm this is the intended evaluation setup.
inv_cov_tensor = torch.tensor([np.linalg.inv(c) for c in u.cond_cov()])[n_time_steps_train:]
The following covariance matrix and dendrogram show the hierarchical structure of asset returns.
# Average the estimated covariance over the training days and visualize the
# hierarchical (industry) structure via a clustered heatmap and dendrogram.
cov_avg = np.array(covs_nerive['mrc'])[:n_time_steps_train].mean(0)
corr = hd.to_corr(cov_avg)
sns.clustermap(corr)
from scipy.cluster.hierarchy import ward, dendrogram
# NOTE(review): `ward` treats `corr` as an observation matrix, not as a
# precomputed distance matrix; consider a condensed distance built from
# (1 - corr) if distance semantics are intended.
linkage_matrix = ward(corr)
fig, ax = plt.subplots(figsize=(10, 10))
# `dendrogram` returns a dict of plot data, not an Axes object.
dendro = dendrogram(linkage_matrix, orientation="right")
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,      # booleans, not the removed string 'off'
    top=False,
    labelbottom=False,
)
By construction the returns are not predictable without conditioning on the sector. The figure below illustrates this.
# 3-D scatter of all (x1, x2, y) points pooled across sectors: without
# conditioning on the sector, no relationship is visible by construction.
n = n_time_steps_train
p = n_stocks
x1 = X_train[:,0,:]
x2 = X_train[:,1,:]
y = Y_train
fig = plt.figure()
ax = plt.axes(projection='3d')
ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.set_zlabel('y');
ax.scatter(x1, x2, y, c=y, cmap='plasma', linewidth=0.0);
plt.title("Log-returns without conditioning on sectors.")
plt.show()
# Regroup data per sector: arrays of shape (n_ind, n_time_steps, npi),
# i.e. one (time x stocks-in-sector) panel per industry.
x1_train = np.empty((n_ind, X_train.shape[0], npi))
x2_train = np.empty_like(x1_train)
y_train = np.empty_like(x1_train)
x1_valid = np.empty((n_ind, X_valid.shape[0], npi))
x2_valid = np.empty_like(x1_valid)
y_valid = np.empty_like(x1_valid)
for i in range(n_ind):
    # Columns i*npi:(i+1)*npi are the stocks belonging to industry i.
    x1_train[i,:,:] = X_train[:,0,i*npi:(i+1)*npi]
    x2_train[i,:,:] = X_train[:,1,i*npi:(i+1)*npi]
    y_train[i,:,:] = Y_train[:, i*npi:(i+1)*npi]
    x1_valid[i,:,:] = X_valid[:,0,i*npi:(i+1)*npi]
    x2_valid[i,:,:] = X_valid[:,1,i*npi:(i+1)*npi]
    y_valid[i,:,:] = Y_valid[:, i*npi:(i+1)*npi]
The following figure shows the train set data points per sector. There are not enough observations to fit a separate neural network to each sector without informative priors.
# Grid of 3-D scatter plots, one panel per sector, for the train set.
fig = plt.figure(figsize=(13,13))
fig.suptitle('Train set per sector.', fontsize=20)
# add_subplot requires integer grid dimensions; np.ceil returns a float.
grid = int(np.ceil(np.sqrt(n_ind)))
for i in range(n_ind):
    ax = fig.add_subplot(grid, grid, i+1, projection='3d')
    ax.set_xlabel('$x_1$', fontsize=8)
    ax.set_ylabel('$x_2$', fontsize=8)
    ax.set_zlabel('y', fontsize=8)
    ax.tick_params(labelsize=8)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_zlim(-0.01, 0.01)
    ax.scatter(x1_train[i,:,:], x2_train[i,:,:], y_train[i,:,:],
               c=y_train[i,:,:], cmap='viridis', linewidth=0.0)
plt.show()
The next figure shows the test set data points per sector.
# Grid of 3-D scatter plots, one panel per sector, for the test set.
fig = plt.figure(figsize=(13,13))
fig.suptitle('Test set per sector.', fontsize=20)
# add_subplot requires integer grid dimensions; np.ceil returns a float.
grid = int(np.ceil(np.sqrt(n_ind)))
for i in range(n_ind):
    ax = fig.add_subplot(grid, grid, i+1, projection='3d')
    ax.set_xlabel('$x_1$', fontsize=8)
    ax.set_ylabel('$x_2$', fontsize=8)
    ax.set_zlabel('y', fontsize=8)
    ax.tick_params(labelsize=8)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_zlim(-0.01, 0.01)
    ax.scatter(x1_valid[i,:,:], x2_valid[i,:,:], y_valid[i,:,:],
               c=y_valid[i,:,:], cmap='viridis', linewidth=0.0)
plt.show()
# This idea comes from:
# https://twiecki.io/blog/2018/08/13/hierarchical_bayesian_neural_network/.
# Non-centered specification of hierarchical model:
# see https://twiecki.github.io/blog/2017/02/08/bayesian-hierchical-non-centered/
# Why?:
# https://www.youtube.com/watch?v=gSd1msFFZTw
# Flatten time and within-sector stock dims: X becomes
# (n_ind, npi*n_time_steps, 2), Y becomes (n_ind, npi*n_time_steps).
X_train = np.stack((x1_train.reshape(n_ind, npi*n_time_steps_train),
                    x2_train.reshape(n_ind, npi*n_time_steps_train)), 2)
Y_train = y_train.reshape(n_ind, npi*n_time_steps_train)
# Shared variables so the validation set can be swapped in for prediction.
ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)
n_hidden = [10, 5]
n_data = X_train.shape[2]
n_grps = n_ind
# Initialize random weights between each layer
init_1 = np.random.randn(n_data, n_hidden[0]).astype(theano.config.floatX)
init_2 = np.random.randn(n_hidden[0], n_hidden[1]).astype(theano.config.floatX)
init_out = np.random.randn(n_hidden[1]).astype(theano.config.floatX)
# Hierarchical Bayesian NN: group-level weight distributions act as priors
# for the per-sector networks (non-centered parameterization).
with pm.Model() as neural_network:
    # Group mean distribution for input to hidden layer
    weights_in_1_grp = pm.Normal('w_in_1_grp', 0, sd=1,
                                 shape=(n_data, n_hidden[0]),
                                 testval=init_1)
    # Group standard-deviation
    weights_in_1_grp_sd = pm.HalfNormal('w_in_1_grp_sd', sd=1.)
    # Group mean distribution for weights from 1st to 2nd layer
    weights_1_2_grp = pm.Normal('w_1_2_grp', 0, sd=1,
                                shape=(n_hidden[0], n_hidden[1]),
                                testval=init_2)
    weights_1_2_grp_sd = pm.HalfNormal('w_1_2_grp_sd', sd=1.)
    # Group mean distribution from hidden layer to output
    weights_2_out_grp = pm.Normal('w_2_out_grp', 0, sd=1,
                                  shape=(n_hidden[1],),
                                  testval=init_out)
    weights_2_out_grp_sd = pm.HalfNormal('w_2_out_grp_sd', sd=1.)
    # Separate weights for each different model, just add a 3rd dimension
    # of weights
    # Non-centered: raw ~ N(0, 1), then scale by group sd and shift by
    # group mean — improves sampler geometry.
    weights_in_1_raw = pm.Normal('w_in_1',
                                 shape=(n_grps, n_data, n_hidden[0]))
    weights_in_1 = weights_in_1_raw * weights_in_1_grp_sd + weights_in_1_grp
    weights_1_2_raw = pm.Normal('w_1_2',
                                shape=(n_grps, n_hidden[0], n_hidden[1]))
    weights_1_2 = weights_1_2_raw * weights_1_2_grp_sd + weights_1_2_grp
    weights_2_out_raw = pm.Normal('w_2_out',
                                  shape=(n_grps, n_hidden[1]))
    weights_2_out = weights_2_out_raw * weights_2_out_grp_sd + weights_2_out_grp
    # Build neural-network using relu activation function
    act_1 = tt.nnet.relu(tt.batched_dot(ann_input,
                                        weights_in_1))
    act_2 = tt.nnet.relu(tt.batched_dot(act_1,
                                        weights_1_2))
    # linear output layer
    intercept = pm.Normal('intercept', mu=0, sd=10)
    act_out = tt.batched_dot(act_2, weights_2_out) + intercept
    # Gaussian likelihood with inferred observation noise.
    sigma = pm.HalfNormal('sigma', sd=10)
    out = pm.Normal('out',
                    act_out,
                    sigma,
                    observed=ann_output)
with neural_network:
    # NOTE(review): tune=50 with a single chain is likely too short for
    # converged posteriors — increase for anything beyond a demo.
    trace = pm.sample(init='advi+adapt_diag', n_init=200000, tune=50, chains=1,
                      nuts_kwargs={'target_accept': 0.9},)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
# Posterior-predictive mean as point prediction.
y_pred = ppc['out'].mean(0)
mse(y_pred, Y_train)
X_valid = np.stack((x1_valid.reshape(n_ind, npi*n_time_steps_valid),
                    x2_valid.reshape(n_ind, npi*n_time_steps_valid)), 2)
Y_valid = y_valid.reshape(n_ind, npi*n_time_steps_valid)
# Swap the validation data into the shared variables and predict.
ann_input.set_value(X_valid)
ann_output.set_value(Y_valid)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
y_hat_valid = ppc['out'].mean(0)
mse(y_hat_valid, Y_valid)
The figure below shows the predictions (green) and targets (blue) per sector for the hierarchical model.
# Per-sector 3-D plots: predictions (green) vs targets (blue).
fig = plt.figure(figsize=(13,13))
fig.suptitle('Predictions per sector.', fontsize=20)
# add_subplot requires integer grid dimensions; np.ceil returns a float.
grid = int(np.ceil(np.sqrt(n_ind)))
for i in range(n_ind):
    ax = fig.add_subplot(grid, grid, i+1, projection='3d')
    ax.set_xlabel('$x_1$', fontsize=8)
    ax.set_ylabel('$x_2$', fontsize=8)
    ax.set_zlabel('y', fontsize=8)
    ax.tick_params(labelsize=8)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_zlim(-0.01, 0.01)
    # Predictions in green.
    ax.scatter(x1_valid[i,:,:], x2_valid[i,:,:], y_hat_valid[i,:],
               c='green',
               linewidth=0.0)
    # Targets in blue.
    ax.scatter(x1_valid[i,:,:], x2_valid[i,:,:], y_valid[i,:,:],
               c='blue',
               linewidth=0.0)
plt.show()
# Portfolio backtest on the validation days.
# NOTE(review): reshaping (n_ind, npi*n_t) arrays to (n_stocks, n_t)
# assumes a compatible memory layout; verify the stock/time ordering
# matches the stock ordering of `inv_cov_tensor`.
ir, rets = get_ir(log_rets=torch.tensor(Y_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  Y_hat=torch.tensor(y_hat_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  inv_cov_tensor=inv_cov_tensor)
print('Test set information ratio:', ir)
plt.plot(np.cumsum(rets))
plt.title('Test set cumulative portfolio log-returns')
plt.xlabel('days')
plt.ylabel('cumulative portfolio log-returns')
# Pooled model: flatten everything into one (n_obs, 2) design matrix.
X_train = np.stack((x1_train.reshape(n_ind*npi*n_time_steps_train),
                    x2_train.reshape(n_ind*npi*n_time_steps_train)), 1)
# Comment out the following line to fit the pooled model without the
# industry feature.
X_train = np.column_stack((X_train,
                           np.repeat(industry_loadings, n_time_steps_train, 0)))
Y_train = y_train.reshape(n_ind*npi*n_time_steps_train)
# Shared variables so the validation set can be swapped in for prediction.
ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)
n_hidden = [10, 5]
n_data = X_train.shape[1]
# Initialize random weights between each layer
init_1 = np.random.randn(n_data, n_hidden[0]).astype(theano.config.floatX)
init_2 = np.random.randn(n_hidden[0], n_hidden[1]).astype(theano.config.floatX)
init_out = np.random.randn(n_hidden[1]).astype(theano.config.floatX)
# Standard (non-hierarchical) Bayesian NN fit on the pooled data, with the
# one-hot industry encoding included as input features.
with pm.Model() as neural_network:
    weights_in_1 = pm.Normal('w_in_1_grp', 0, sd=1,
                             shape=(n_data, n_hidden[0]),
                             testval=init_1)
    weights_1_2 = pm.Normal('w_1_2_grp', 0, sd=1,
                            shape=(n_hidden[0], n_hidden[1]),
                            testval=init_2)
    weights_2_out = pm.Normal('w_2_out_grp', 0, sd=1,
                              shape=(n_hidden[1],),
                              testval=init_out)
    # Build neural-network using relu activation function
    act_1 = tt.nnet.relu(tt.dot(ann_input,
                                weights_in_1))
    act_2 = tt.nnet.relu(tt.dot(act_1,
                                weights_1_2))
    # linear output layer
    intercept = pm.Normal('intercept', mu=0, sd=10)
    act_out = tt.dot(act_2, weights_2_out) + intercept
    # Gaussian likelihood with inferred observation noise.
    sigma = pm.HalfNormal('sigma', sd=10)
    out = pm.Normal('out',
                    act_out,
                    sigma,
                    observed=ann_output)
with neural_network:
    # NOTE(review): tune=50 with a single chain is likely too short for
    # converged posteriors — increase for anything beyond a demo.
    trace = pm.sample(
        init='advi+adapt_diag', tune=50, chains=1, n_init=200000,
        nuts_kwargs={'target_accept': 0.9},)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
# Posterior-predictive mean, regrouped per sector for comparison.
y_hat = ppc['out'].mean(0).reshape(n_ind, npi*n_time_steps_train)
mse(y_hat, Y_train.reshape(n_ind, npi*n_time_steps_train))
X_valid = np.stack((x1_valid.reshape(n_ind*npi*n_time_steps_valid),
                    x2_valid.reshape(n_ind*npi*n_time_steps_valid)), 1)
X_valid = np.column_stack((X_valid, np.repeat(industry_loadings, n_time_steps_valid, 0)))
Y_valid = y_valid.reshape(n_ind*npi*n_time_steps_valid)
# Swap the validation data into the shared variables and predict.
ann_input.set_value(X_valid)
ann_output.set_value(Y_valid)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
y_hat_valid = ppc['out'].mean(0).reshape(n_ind, npi*n_time_steps_valid)
mse(y_hat_valid, Y_valid.reshape(n_ind, npi*n_time_steps_valid))
The figure below shows the predictions (green) and targets (blue) per sector for the standard model.
# Per-sector 3-D plots for the pooled model: predictions (green) vs
# targets (blue).
fig = plt.figure(figsize=(13,13))
fig.suptitle('Predictions per sector.', fontsize=20)
# add_subplot requires integer grid dimensions; np.ceil returns a float.
grid = int(np.ceil(np.sqrt(n_ind)))
for i in range(n_ind):
    ax = fig.add_subplot(grid, grid, i+1, projection='3d')
    ax.set_xlabel('$x_1$', fontsize=8)
    ax.set_ylabel('$x_2$', fontsize=8)
    ax.set_zlabel('y', fontsize=8)
    ax.tick_params(labelsize=8)
    ax.set_xlim(-2, 2)
    ax.set_ylim(-2, 2)
    ax.set_zlim(-0.01, 0.01)
    # Predictions in green.
    ax.scatter(x1_valid[i,:,:], x2_valid[i,:,:], y_hat_valid[i,:],
               c='green',
               linewidth=0.0)
    # Targets in blue.
    ax.scatter(x1_valid[i,:,:], x2_valid[i,:,:], y_valid[i,:,:],
               c='blue',
               linewidth=0.0)
plt.show()
# Portfolio backtest on the validation days for the pooled model.
# NOTE(review): same reshape-ordering caveat as for the hierarchical model.
ir, rets = get_ir(log_rets=torch.tensor(Y_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  Y_hat=torch.tensor(y_hat_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  inv_cov_tensor=inv_cov_tensor)
print('Test set information ratio:', ir)
plt.plot(np.cumsum(rets))
plt.title('Test set cumulative portfolio log-returns')
plt.xlabel('days')
plt.ylabel('cumulative portfolio log-returns')
# Pooled model with eigenvectors of the average covariance matrix as
# stock encodings instead of the one-hot industry matrix.
X_train = np.stack((x1_train.reshape(n_ind*npi*n_time_steps_train),
                    x2_train.reshape(n_ind*npi*n_time_steps_train)), 1)
# number of principal components, 0 if all.
n_pc = 0
# With n_pc = 0 the slice [:, -0:] is [:, 0:], i.e. ALL eigenvectors
# (eigh returns them in ascending eigenvalue order).
evectors = np.linalg.eigh(cov_avg)[1][:, -n_pc:]
# evectors = np.linalg.eigh(u.uncond_cov())[1][:, -n_pc:]
X_train = np.column_stack((X_train,
                           np.repeat(evectors,
                                     n_time_steps_train, 0)))
Y_train = y_train.reshape(n_ind*npi*n_time_steps_train)
# Shared variables so the validation set can be swapped in for prediction.
ann_input = theano.shared(X_train)
ann_output = theano.shared(Y_train)
n_hidden = [10, 5]
n_data = X_train.shape[1]
# Initialize random weights between each layer
init_1 = np.random.randn(n_data, n_hidden[0]).astype(theano.config.floatX)
init_2 = np.random.randn(n_hidden[0], n_hidden[1]).astype(theano.config.floatX)
init_out = np.random.randn(n_hidden[1]).astype(theano.config.floatX)
# Standard Bayesian NN on the pooled data with eigenvector encodings.
with pm.Model() as neural_network:
    weights_in_1 = pm.Normal('w_in_1_grp', 0, sd=1,
                             shape=(n_data, n_hidden[0]),
                             testval=init_1)
    weights_1_2 = pm.Normal('w_1_2_grp', 0, sd=1,
                            shape=(n_hidden[0], n_hidden[1]),
                            testval=init_2)
    weights_2_out = pm.Normal('w_2_out_grp', 0, sd=1,
                              shape=(n_hidden[1],),
                              testval=init_out)
    # Build neural-network using relu activation function
    act_1 = tt.nnet.relu(tt.dot(ann_input,
                                weights_in_1))
    act_2 = tt.nnet.relu(tt.dot(act_1,
                                weights_1_2))
    # linear output layer
    intercept = pm.Normal('intercept', mu=0, sd=10)
    act_out = tt.dot(act_2, weights_2_out) + intercept
    # Gaussian likelihood with inferred observation noise.
    sigma = pm.HalfNormal('sigma', sd=10)
    out = pm.Normal('out',
                    act_out,
                    sigma,
                    observed=ann_output)
with neural_network:
    # NOTE(review): tune=50 with a single chain is likely too short for
    # converged posteriors — increase for anything beyond a demo.
    trace = pm.sample(
        init='advi+adapt_diag', tune=50, chains=1, n_init=200000,
        nuts_kwargs={'target_accept': 0.9},)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
# Posterior-predictive mean, regrouped per sector for comparison.
y_hat = ppc['out'].mean(0).reshape(n_ind, npi*n_time_steps_train)
mse(y_hat, Y_train.reshape(n_ind, npi*n_time_steps_train))
X_valid = np.stack((x1_valid.reshape(n_ind*npi*n_time_steps_valid),
                    x2_valid.reshape(n_ind*npi*n_time_steps_valid)), 1)
X_valid = np.column_stack((X_valid, np.repeat(evectors, n_time_steps_valid, 0)))
Y_valid = y_valid.reshape(n_ind*npi*n_time_steps_valid)
# Swap the validation data into the shared variables and predict.
ann_input.set_value(X_valid)
ann_output.set_value(Y_valid)
ppc = pm.sample_posterior_predictive(trace, model=neural_network, samples=1000)
y_hat_valid = ppc['out'].mean(0).reshape(n_ind, npi*n_time_steps_valid)
mse(y_hat_valid, Y_valid.reshape(n_ind, npi*n_time_steps_valid))
# Portfolio backtest on the validation days.
# NOTE(review): same reshape-ordering caveat as for the other models.
ir, rets = get_ir(log_rets=torch.tensor(Y_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  Y_hat=torch.tensor(y_hat_valid.reshape(n_stocks, n_time_steps_valid)).T,
                  inv_cov_tensor=inv_cov_tensor)
print('Test set information ratio:', ir)
plt.plot(np.cumsum(rets))
plt.title('Test set cumulative portfolio log-returns')
plt.xlabel('days')
plt.ylabel('cumulative portfolio log-returns')
Conclusion
Hierarchical neural networks can successfully model panel data of asset returns with sector dependent data generating functions by sharing information via informative priors. Still, standard neural networks are surprisingly good at capturing the sector differences as well if the sector is given as a one-hot encoded matrix.